Biostat 212a Homework 2

Due Feb 8, 2025 @ 11:59PM

Author

Wenqiang Ge UID:106371961

Published

February 5, 2025

1 ISL Exercise 4.8.1 (10pts)


Solution:

2 ISL Exercise 4.8.6 (10pts)


Solution:

3 ISL Exercise 4.8.9 (10pts)


Solution:

4 ISL Exercise 4.8.13 (a)-(i) (50pts)


Solution:

# ISLR2 supplies the Weekly data set; MASS supplies lda() and qda() used below
library(ISLR2)
library(MASS)

data("Weekly")
# Structure of the dataset: 1089 weekly market observations on 9 variables
# (five lagged returns, Volume, Today's return, and the Up/Down Direction factor)
str(Weekly)
'data.frame':   1089 obs. of  9 variables:
 $ Year     : num  1990 1990 1990 1990 1990 1990 1990 1990 1990 1990 ...
 $ Lag1     : num  0.816 -0.27 -2.576 3.514 0.712 ...
 $ Lag2     : num  1.572 0.816 -0.27 -2.576 3.514 ...
 $ Lag3     : num  -3.936 1.572 0.816 -0.27 -2.576 ...
 $ Lag4     : num  -0.229 -3.936 1.572 0.816 -0.27 ...
 $ Lag5     : num  -3.484 -0.229 -3.936 1.572 0.816 ...
 $ Volume   : num  0.155 0.149 0.16 0.162 0.154 ...
 $ Today    : num  -0.27 -2.576 3.514 0.712 1.178 ...
 $ Direction: Factor w/ 2 levels "Down","Up": 1 1 2 2 2 1 2 2 2 1 ...

(a)

# Numerical summary of every variable; note Direction splits 484 Down vs 605 Up weeks
summary(Weekly)
      Year           Lag1               Lag2               Lag3         
 Min.   :1990   Min.   :-18.1950   Min.   :-18.1950   Min.   :-18.1950  
 1st Qu.:1995   1st Qu.: -1.1540   1st Qu.: -1.1540   1st Qu.: -1.1580  
 Median :2000   Median :  0.2410   Median :  0.2410   Median :  0.2410  
 Mean   :2000   Mean   :  0.1506   Mean   :  0.1511   Mean   :  0.1472  
 3rd Qu.:2005   3rd Qu.:  1.4050   3rd Qu.:  1.4090   3rd Qu.:  1.4090  
 Max.   :2010   Max.   : 12.0260   Max.   : 12.0260   Max.   : 12.0260  
      Lag4               Lag5              Volume            Today         
 Min.   :-18.1950   Min.   :-18.1950   Min.   :0.08747   Min.   :-18.1950  
 1st Qu.: -1.1580   1st Qu.: -1.1660   1st Qu.:0.33202   1st Qu.: -1.1540  
 Median :  0.2380   Median :  0.2340   Median :1.00268   Median :  0.2410  
 Mean   :  0.1458   Mean   :  0.1399   Mean   :1.57462   Mean   :  0.1499  
 3rd Qu.:  1.4090   3rd Qu.:  1.4050   3rd Qu.:2.05373   3rd Qu.:  1.4050  
 Max.   : 12.0260   Max.   : 12.0260   Max.   :9.32821   Max.   : 12.0260  
 Direction 
 Down:484  
 Up  :605  




# Plot the Volume over time (volume grows strongly with Year; cf. the 0.84
# Year-Volume correlation below)
plot(Weekly$Year, Weekly$Volume, main="Trading Volume Over Time", xlab="Year", ylab="Volume", col="blue", pch=20)

# Boxplot of market return (Today) by Direction
boxplot(Today ~ Direction, data=Weekly, main="Market Return by Direction", ylab="Today’s Return", col=c("red", "green"))

# Correlation matrix of the numeric columns only. Selecting columns by type
# instead of the hard-coded position -9 gives the same matrix but stays
# correct if the data set's columns are ever reordered.
cor(Weekly[, vapply(Weekly, is.numeric, logical(1))])
              Year         Lag1        Lag2        Lag3         Lag4
Year    1.00000000 -0.032289274 -0.03339001 -0.03000649 -0.031127923
Lag1   -0.03228927  1.000000000 -0.07485305  0.05863568 -0.071273876
Lag2   -0.03339001 -0.074853051  1.00000000 -0.07572091  0.058381535
Lag3   -0.03000649  0.058635682 -0.07572091  1.00000000 -0.075395865
Lag4   -0.03112792 -0.071273876  0.05838153 -0.07539587  1.000000000
Lag5   -0.03051910 -0.008183096 -0.07249948  0.06065717 -0.075675027
Volume  0.84194162 -0.064951313 -0.08551314 -0.06928771 -0.061074617
Today  -0.03245989 -0.075031842  0.05916672 -0.07124364 -0.007825873
               Lag5      Volume        Today
Year   -0.030519101  0.84194162 -0.032459894
Lag1   -0.008183096 -0.06495131 -0.075031842
Lag2   -0.072499482 -0.08551314  0.059166717
Lag3    0.060657175 -0.06928771 -0.071243639
Lag4   -0.075675027 -0.06107462 -0.007825873
Lag5    1.000000000 -0.05851741  0.011012698
Volume -0.058517414  1.00000000 -0.033077783
Today   0.011012698 -0.03307778  1.000000000

(b)

# Logistic regression of Direction on the five lagged returns and Volume,
# using the full Weekly data set
logistic_model <- glm(
  Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
  family = binomial,
  data = Weekly
)

# Coefficient estimates, standard errors, and Wald tests
summary(logistic_model)

Call:
glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + 
    Volume, family = binomial, data = Weekly)

Coefficients:
            Estimate Std. Error z value Pr(>|z|)   
(Intercept)  0.26686    0.08593   3.106   0.0019 **
Lag1        -0.04127    0.02641  -1.563   0.1181   
Lag2         0.05844    0.02686   2.175   0.0296 * 
Lag3        -0.01606    0.02666  -0.602   0.5469   
Lag4        -0.02779    0.02646  -1.050   0.2937   
Lag5        -0.01447    0.02638  -0.549   0.5833   
Volume      -0.02274    0.03690  -0.616   0.5377   
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 1496.2  on 1088  degrees of freedom
Residual deviance: 1486.4  on 1082  degrees of freedom
AIC: 1500.4

Number of Fisher Scoring iterations: 4

\(\log(\frac{P(Up)}{1-P(Up)})\) = 0.26686 − 0.04127 × Lag1 + 0.05844 × Lag2 − 0.01606 × Lag3 − 0.02779 × Lag4 − 0.01447 × Lag5 − 0.02274 × Volume.

Yes: Lag2 is the only predictor with a p-value below 0.05 (p = 0.0296), so it is statistically significant; the intercept is also significant (p = 0.0019). Lag1, Lag3, Lag4, Lag5, and Volume all have p-values above 0.05 and are not significant.

(c)

# In-sample predicted probabilities of an "Up" week
fitted_probs <- predict(logistic_model, type = "response")

# Classify with a 0.5 probability cutoff
fitted_classes <- ifelse(fitted_probs > 0.5, "Up", "Down")

# Confusion matrix: rows = predicted, columns = observed
conf_matrix <- table(Predicted = fitted_classes, Actual = Weekly$Direction)

# Overall fraction of weeks classified correctly (diagonal of the table)
accuracy <- sum(diag(conf_matrix)) / sum(conf_matrix)

print(conf_matrix)
         Actual
Predicted Down  Up
     Down   54  48
     Up    430 557
print(paste("Overall accuracy:", round(accuracy, 4)))
[1] "Overall accuracy: 0.5611"

True Positives (TP) = 557 ; False Positives (FP) = 430 ; True Negatives (TN) = 54 ; False Negatives (FN) = 48

Accuracy = \(\frac{TP+TN}{\text{Total Samples}} = \frac{557+54}{54+48+430+557}=0.5611\). This is only slightly better than random guessing (50%).

The model is biased towards predicting “Up”, as indicated by the large number of false positives (FP = 430). The model fails to predict “Down” accurately, with only 54 correct “Down” predictions out of 484 actual “Down” instances.

(d)

# Training period: 1990-2008; held-out test period: 2009-2010
is_train <- Weekly$Year < 2009
train_data <- Weekly[is_train, ]
test_data <- Weekly[!is_train, ]

# Logistic regression with Lag2 as the sole predictor
fit_lag2 <- glm(Direction ~ Lag2, family = binomial, data = train_data)

# Probability of "Up" for each held-out week, then a 0.5 cutoff
prob_up <- predict(fit_lag2, newdata = test_data, type = "response")
test_preds <- ifelse(prob_up > 0.5, "Up", "Down")

# Confusion matrix and overall test accuracy
conf_matrix_test <- table(Predicted = test_preds, Actual = test_data$Direction)
test_accuracy <- mean(test_preds == test_data$Direction)

print(conf_matrix_test)
         Actual
Predicted Down Up
     Down    9  5
     Up     34 56
print(paste("Test accuracy:", round(test_accuracy, 4)))
[1] "Test accuracy: 0.625"

(e)

# Linear discriminant analysis with Lag2, fit on the training period
fit_lda <- lda(Direction ~ Lag2, data = train_data)

# Predicted classes for the held-out weeks
pred_lda <- predict(fit_lda, newdata = test_data)$class

# Confusion matrix and overall test accuracy
conf_matrix_lda <- table(Predicted = pred_lda, Actual = test_data$Direction)
lda_accuracy <- mean(pred_lda == test_data$Direction)

print(conf_matrix_lda)
         Actual
Predicted Down Up
     Down    9  5
     Up     34 56
print(paste("LDA test accuracy:", round(lda_accuracy, 4)))
[1] "LDA test accuracy: 0.625"

(f)

# Quadratic discriminant analysis with Lag2, fit on the training period
fit_qda <- qda(Direction ~ Lag2, data = train_data)

# Predicted classes for the held-out weeks
pred_qda <- predict(fit_qda, newdata = test_data)$class

# Confusion matrix and overall test accuracy
conf_matrix_qda <- table(Predicted = pred_qda, Actual = test_data$Direction)
qda_accuracy <- mean(pred_qda == test_data$Direction)

print(conf_matrix_qda)
         Actual
Predicted Down Up
     Down    0  0
     Up     43 61
print(paste("QDA test accuracy:", round(qda_accuracy, 4)))
[1] "QDA test accuracy: 0.5865"

(g)

library(class)

# Prepare training and test data: knn() expects matrix predictors,
# so the single Lag2 column is wrapped below via matrix()
train_X <- train_data$Lag2
test_X <- test_data$Lag2
train_Y <- train_data$Direction
test_Y <- test_data$Direction

# Apply KNN with K=1
# NOTE(review): knn() breaks distance ties at random, so the counts printed
# below can vary between runs; call set.seed() before this line if exactly
# reproducible results are needed.
knn_preds <- knn(train = matrix(train_X), test = matrix(test_X), 
                 cl = train_Y, k = 1)

# Compute confusion matrix (rows = predicted, columns = observed)
conf_matrix_knn <- table(Predicted = knn_preds, Actual = test_Y)

# Compute accuracy (fraction of held-out weeks classified correctly)
knn_accuracy <- mean(knn_preds == test_Y)

# Print results
print(conf_matrix_knn)
         Actual
Predicted Down Up
     Down   21 30
     Up     22 31
print(paste("KNN (K=1) test accuracy:", round(knn_accuracy, 4)))
[1] "KNN (K=1) test accuracy: 0.5"

(h)

library(e1071)

# Naive Bayes classifier with Lag2, fit on the training period
fit_nb <- naiveBayes(Direction ~ Lag2, data = train_data)

# Predicted classes for the held-out weeks
pred_nb <- predict(fit_nb, newdata = test_data)

# Confusion matrix and overall test accuracy
conf_matrix_nb <- table(Predicted = pred_nb, Actual = test_data$Direction)
nb_accuracy <- mean(pred_nb == test_data$Direction)

print(conf_matrix_nb)
         Actual
Predicted Down Up
     Down    0  0
     Up     43 61
print(paste("Naive Bayes test accuracy:", round(nb_accuracy, 4)))
[1] "Naive Bayes test accuracy: 0.5865"

(i)

# Collect each model's held-out accuracy under its display label,
# then assemble the comparison table from the named vector
held_out_acc <- c(
  "Logistic Regression" = test_accuracy,
  "LDA"                 = lda_accuracy,
  "QDA"                 = qda_accuracy,
  "KNN (K=1)"           = knn_accuracy,
  "Naive Bayes"         = nb_accuracy
)
model_comparison <- data.frame(
  Model = names(held_out_acc),
  Accuracy = unname(held_out_acc)
)

# Print comparison results
print(model_comparison)
                Model  Accuracy
1 Logistic Regression 0.6250000
2                 LDA 0.6250000
3                 QDA 0.5865385
4           KNN (K=1) 0.5000000
5         Naive Bayes 0.5865385

The Logistic Regression and LDA appear to have the best results on this data, and they both have 0.625 accuracy.

5 Bonus question: ISL Exercise 4.8.13 Part (j) (30pts)


Solution:

(j) Logistic Regression with multiple predictors

# Logistic regression using all five lags plus Volume, fit on the training years
fit_logit_ext <- glm(
  Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
  family = binomial, data = train_data
)

# Held-out predictions with a 0.5 probability cutoff
prob_logit_ext <- predict(fit_logit_ext, newdata = test_data, type = "response")
pred_logit_ext <- ifelse(prob_logit_ext > 0.5, "Up", "Down")

# Confusion matrix and overall test accuracy
conf_matrix_logistic_extended <- table(Predicted = pred_logit_ext,
                                       Actual = test_data$Direction)
logistic_accuracy_extended <- mean(pred_logit_ext == test_data$Direction)

print(conf_matrix_logistic_extended)
         Actual
Predicted Down Up
     Down   31 44
     Up     12 17
print(paste("Extended Logistic Regression Accuracy:", round(logistic_accuracy_extended, 4)))
[1] "Extended Logistic Regression Accuracy: 0.4615"

Logistic Regression with interaction terms

# Logistic regression with Lag2, Volume, and their interaction term
fit_inter <- glm(Direction ~ Lag2 * Volume, family = binomial,
                 data = train_data)

# Held-out predictions with a 0.5 probability cutoff
prob_inter <- predict(fit_inter, newdata = test_data, type = "response")
pred_inter <- ifelse(prob_inter > 0.5, "Up", "Down")

# Confusion matrix and overall test accuracy
conf_matrix_logistic_interaction <- table(Predicted = pred_inter,
                                          Actual = test_data$Direction)
logistic_accuracy_interaction <- mean(pred_inter == test_data$Direction)

print(conf_matrix_logistic_interaction)
         Actual
Predicted Down Up
     Down   20 25
     Up     23 36
print(paste("Logistic Regression with Interaction Accuracy:", 
            round(logistic_accuracy_interaction, 4)))
[1] "Logistic Regression with Interaction Accuracy: 0.5385"

LDA with More Predictors

library(MASS)

# LDA with all five lags plus Volume as predictors
fit_lda_ext <- lda(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
                   data = train_data)

# Predicted classes for the held-out weeks
pred_lda_ext <- predict(fit_lda_ext, newdata = test_data)$class

# Confusion matrix and overall test accuracy
conf_matrix_lda_extended <- table(Predicted = pred_lda_ext,
                                  Actual = test_data$Direction)
lda_accuracy_extended <- mean(pred_lda_ext == test_data$Direction)

print(conf_matrix_lda_extended)
         Actual
Predicted Down Up
     Down   31 44
     Up     12 17
print(paste("Extended LDA Accuracy:", round(lda_accuracy_extended, 4)))
[1] "Extended LDA Accuracy: 0.4615"

QDA with More Predictors

# QDA with all five lags plus Volume as predictors
fit_qda_ext <- qda(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
                   data = train_data)

# Predicted classes for the held-out weeks
pred_qda_ext <- predict(fit_qda_ext, newdata = test_data)$class

# Confusion matrix and overall test accuracy
conf_matrix_qda_extended <- table(Predicted = pred_qda_ext,
                                  Actual = test_data$Direction)
qda_accuracy_extended <- mean(pred_qda_ext == test_data$Direction)

print(conf_matrix_qda_extended)
         Actual
Predicted Down Up
     Down   33 49
     Up     10 12
print(paste("Extended QDA Accuracy:", round(qda_accuracy_extended, 4)))
[1] "Extended QDA Accuracy: 0.4327"

Tuning K for KNN

library(class)

# Candidate neighborhood sizes, defined once so the lapply() call and the
# reporting loop below cannot drift out of sync
k_values <- c(1, 3, 5, 7, 10, 15, 20)

# Fit a KNN classifier (Lag2 only) on the training period and evaluate it on
# the held-out weeks for a given K. Returns the confusion matrix and accuracy.
# NOTE(review): knn() breaks distance ties at random; set a seed beforehand
# for exactly reproducible results.
knn_evaluate <- function(k) {
  knn_preds <- knn(train = as.matrix(train_data[, c("Lag2")]), 
                   test = as.matrix(test_data[, c("Lag2")]), 
                   cl = train_data$Direction, k = k)
  
  conf_matrix_knn <- table(Predicted = knn_preds, Actual = test_data$Direction)
  knn_accuracy <- mean(knn_preds == test_data$Direction)
  
  list(conf_matrix = conf_matrix_knn, accuracy = knn_accuracy)
}

# Evaluate every candidate K
knn_results <- lapply(k_values, knn_evaluate)

# Report the confusion matrix and accuracy for each K
# (seq_along() is safe even if k_values were empty, unlike 1:length())
for (i in seq_along(knn_results)) {
  print(paste("KNN with K =", k_values[i]))
  print(knn_results[[i]]$conf_matrix)
  print(paste("Accuracy:", round(knn_results[[i]]$accuracy, 4)))
}
[1] "KNN with K = 1"
         Actual
Predicted Down Up
     Down   21 30
     Up     22 31
[1] "Accuracy: 0.5"
[1] "KNN with K = 3"
         Actual
Predicted Down Up
     Down   16 19
     Up     27 42
[1] "Accuracy: 0.5577"
[1] "KNN with K = 5"
         Actual
Predicted Down Up
     Down   15 22
     Up     28 39
[1] "Accuracy: 0.5192"
[1] "KNN with K = 7"
         Actual
Predicted Down Up
     Down   16 19
     Up     27 42
[1] "Accuracy: 0.5577"
[1] "KNN with K = 10"
         Actual
Predicted Down Up
     Down   17 22
     Up     26 39
[1] "Accuracy: 0.5385"
[1] "KNN with K = 15"
         Actual
Predicted Down Up
     Down   20 20
     Up     23 41
[1] "Accuracy: 0.5865"
[1] "KNN with K = 20"
         Actual
Predicted Down Up
     Down   20 21
     Up     23 40
[1] "Accuracy: 0.5769"

Naive Bayes with More Predictors

library(e1071)

# Naive Bayes with all five lags plus Volume as predictors
fit_nb_ext <- naiveBayes(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
                         data = train_data)

# Predicted classes for the held-out weeks
pred_nb_ext <- predict(fit_nb_ext, newdata = test_data)

# Confusion matrix and overall test accuracy
conf_matrix_nb_extended <- table(Predicted = pred_nb_ext,
                                 Actual = test_data$Direction)
nb_accuracy_extended <- mean(pred_nb_ext == test_data$Direction)

print(conf_matrix_nb_extended)
         Actual
Predicted Down Up
     Down   42 56
     Up      1  5
print(paste("Extended Naive Bayes Accuracy:", round(nb_accuracy_extended, 4)))
[1] "Extended Naive Bayes Accuracy: 0.4519"

Comparing All Models

# Collect the held-out accuracy of every model fitted above into one table.
# NOTE(review): the second label is the R variable name "logistic_model_extended";
# for consistency with the other rows it should read "Logistic Regression
# (Extended)" (changing it would also change the printed table below).
model_comparison <- data.frame(
  Model = c("Logistic Regression", "logistic_model_extended", 
            "Logistic Regression (Interaction)", 
            "LDA", "LDA (Extended)", "QDA", "QDA (Extended)", 
            "KNN (K=1)","KNN (K=3)", "KNN (K=5)", "KNN (K=7)", 
            "KNN (K=10)", "KNN (K=15)", "KNN (K=20)", 
            "Naive Bayes", "Naive Bayes (Extended)"),
  Accuracy = c(test_accuracy, logistic_accuracy_extended, 
               logistic_accuracy_interaction, 
               lda_accuracy, lda_accuracy_extended, 
               qda_accuracy, qda_accuracy_extended, 
               knn_results[[1]]$accuracy, knn_results[[2]]$accuracy, 
               knn_results[[3]]$accuracy, knn_results[[4]]$accuracy, 
               knn_results[[5]]$accuracy, knn_results[[6]]$accuracy, 
               knn_results[[7]]$accuracy,
               nb_accuracy, nb_accuracy_extended)
)

# Print comparison results
print(model_comparison)
                               Model  Accuracy
1                Logistic Regression 0.6250000
2            logistic_model_extended 0.4615385
3  Logistic Regression (Interaction) 0.5384615
4                                LDA 0.6250000
5                     LDA (Extended) 0.4615385
6                                QDA 0.5865385
7                     QDA (Extended) 0.4326923
8                          KNN (K=1) 0.5000000
9                          KNN (K=3) 0.5576923
10                         KNN (K=5) 0.5192308
11                         KNN (K=7) 0.5576923
12                        KNN (K=10) 0.5384615
13                        KNN (K=15) 0.5865385
14                        KNN (K=20) 0.5769231
15                       Naive Bayes 0.5865385
16            Naive Bayes (Extended) 0.4519231

6 Bonus question: ISL Exercise 4.8.4 (30pts)


Solution:

(a)

(b)

(c)

(d)

(e)